library(tidyverse)
library(Seurat)
library(patchwork)
library(SingleR)
library(celldex)
library(pheatmap)
# Speeds up scTransform
#BiocManager::install("glmGamPoi")
# Load AR and Tol datasets
AR_data <- CreateSeuratObject(Read10X(data.dir = "~/Desktop/R_projects/Dr_Krummey_Lab/heartTransplant_data/GSE262851_RAW/AR/"), min.cells = 3, min.features = 200)
AR_data$condition <- "AR"
AR_data <- PercentageFeatureSet(AR_data, pattern = "^mt-", col.name = "percent.mt")
VlnPlot(AR_data, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"))
Warning: Default search for "data" layer in "RNA" assay yielded no results; utilizing "counts" layer instead.
# I filtered a bit looser, nFeatureRNA > 5000 vs 2500, percent.mt < 15 vs 5
AR_data <- subset(AR_data, subset = nFeature_RNA > 200 & nFeature_RNA < 5000 & percent.mt < 15)
VlnPlot(AR_data, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"))
Warning: Default search for "data" layer in "RNA" assay yielded no results; utilizing "counts" layer instead.
Tol_data <- CreateSeuratObject(Read10X(data.dir = "~/Desktop/R_projects/Dr_Krummey_Lab/heartTransplant_data/GSE262851_RAW/Tol/"), min.cells = 3, min.features = 200)
Tol_data$condition <- "Tol"
Tol_data <- PercentageFeatureSet(Tol_data, pattern = "^mt-", col.name = "percent.mt")
VlnPlot(Tol_data, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"))
Warning: Default search for "data" layer in "RNA" assay yielded no results; utilizing "counts" layer instead.
Tol_data <- subset(Tol_data, subset = nFeature_RNA > 200 & nFeature_RNA < 5000 & percent.mt < 15)
VlnPlot(Tol_data, features = c("nCount_RNA", "nFeature_RNA", "percent.mt"))
Warning: Default search for "data" layer in "RNA" assay yielded no results; utilizing "counts" layer instead.
# Lets merge asap so we know the same transformations are applied
# Project is just a label to group the merged datasets and I never reference it. Feel free to change!
data.merge <- merge(AR_data, Tol_data, add.cell.ids = c("AR", "Tol"), project = "Heart")
# SCTransform instead of normalize -> variable features -> scale. More up-to-date
data.merge <- SCTransform(data.merge, vars.to.regress = "percent.mt", verbose = FALSE)
data.merge <- RunPCA(data.merge, verbose = FALSE)
# dims of PCA to choose. Probably some fine tuning here for dims and number of neighbors, but I think it looks fine enough already
# It is always good to be thinking whether you are overfitting or underfitting data though!
data.merge = FindNeighbors(data.merge, dims = 1:30, reduction = "pca")
Computing nearest neighbor graph
Computing SNN
data.merge = FindClusters(data.merge, resolution = 0.5, cluster.name = "unintegrated_clusters")
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 5321
Number of edges: 181553
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8922
Number of communities: 17
Elapsed time: 0 seconds
data.merge <- RunUMAP(data.merge, dims = 1:30, reduction = "pca", reduction.name = "umap.unintegrated", verbose = FALSE)
# Always visualize!
# The plot is showing batch effects, so we definitely need to integrate
DimPlot(data.merge, reduction = "umap.unintegrated", group.by = c("condition", "unintegrated_clusters"), combine = FALSE)
[[1]]
[[2]]
# Now we integrate the two layers using the Harmony algorithm with the PCA we calculated
data.integrated <- IntegrateLayers(object = data.merge, method = HarmonyIntegration, orig.reduction = "pca", new.reduction = "harmony", normalization.method = "SCT", verbose = FALSE)
Warning: HarmonyMatrix is deprecated and will be removed in the future from the API in the future
# After integrating we re-find our neighbors and clusters and run UMAP to check our batch effects visually
data.integrated <- FindNeighbors(data.integrated, reduction = "harmony", dims = 1:30)
Computing nearest neighbor graph
Computing SNN
data.integrated <- FindClusters(data.integrated, resolution = 0.5, cluster.name = "harmony_clusters")
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 5321
Number of edges: 188646
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8902
Number of communities: 14
Elapsed time: 0 seconds
data.integrated <- RunUMAP(data.integrated, reduction = "harmony", dims = 1:30, reduction.name = "umap.harmony", verbose = FALSE)
# Much nicer, AR and TOL form well-mixed clusters
DimPlot(data.integrated, reduction = "umap.harmony", group.by = c("condition", "harmony_clusters"), combine = FALSE)
[[1]]
[[2]]
# I would suggest looking into Azimuth instead of SingleR. Better fine labeling
ref <- celldex::ImmGenData()
data.integrated_fine_pred <- SingleR(test = GetAssayData(data.integrated, layer = 'counts'),
ref = ref,
labels = ref$label.fine)
data.integrated_main_pred <- SingleR(test = GetAssayData(data.integrated, layer = 'counts'),
ref = ref,
labels = ref$label.main)
data.integrated$singleR.labels.fine <- data.integrated_fine_pred$labels[match(rownames(data.integrated@meta.data), rownames(data.integrated_fine_pred))]
data.integrated$singleR.labels.main <- data.integrated_main_pred$labels[match(rownames(data.integrated@meta.data), rownames(data.integrated_main_pred))]
DimPlot(data.integrated, reduction = "umap.harmony", group.by = "singleR.labels.main")
DimPlot(data.integrated, reduction = "umap.harmony", group.by = "singleR.labels.fine")
head(sort(table(data.integrated_fine_pred$labels), decreasing=TRUE))
Macrophages (MF.11CLOSER.SALM3) Monocytes (MO.6C+II-)
1408 561
T cells (T.8EFF.OT1.D10LIS) Macrophages (MF.103-11B+24-)
298 228
Endothelial cells (BEC) Macrophages (MF.II+480LO)
219 204
head(sort(table(data.integrated_main_pred$labels), decreasing=TRUE))
Macrophages T cells Monocytes NKT
2807 826 352 334
B cells Endothelial cells
271 220
# 826 T cells
# Finding more ways to plot our results, checking annotation of cells and pruned cells
plotScoreHeatmap(data.integrated_main_pred)
plotDeltaDistribution(data.integrated_main_pred)
View(data_frame(data.integrated@meta.data[["singleR.labels.fine"]]))
View(table(data.integrated@meta.data$singleR.labels.fine))
# Exclude CD4 T cells (using negative lookahead regex)
pattern <- "T cells \\((?!CD4).*\\)"
# Shorter way to add metadata
# Need to add perl = TRUE for the regex to be compatible
data.integrated$non_cd4 <- grepl(pattern, data.integrated@meta.data$singleR.labels.fine, perl = TRUE)
DimPlot(data.integrated, reduction = "umap.harmony", group.by = "non_cd4")
non_cd4_subset <- subset(data.integrated, subset = non_cd4 == TRUE)
# Rerun SCTransform on the subset, since the global structure of variance changed
non_cd4_subset <- SCTransform(non_cd4_subset, vars.to.regress = "percent.mt", verbose = FALSE)
Warning: Different cells and/or features from existing assay SCT
# Sub-clustering of non_cd4 cells
non_cd4_subset <- RunPCA(non_cd4_subset, reduction.name = "non_cd4.pca", verbose = FALSE)
Warning: Key ‘PC_’ taken, using ‘noncd4pca_’ instead
non_cd4_subset <- FindNeighbors(non_cd4_subset, reduction = "non_cd4.pca", dims = 1:30)
Computing nearest neighbor graph
Computing SNN
non_cd4_subset <- FindClusters(non_cd4_subset, resolution = 0.5, cluster.name = "non_cd4_clusters")
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 1176
Number of edges: 47342
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.6982
Number of communities: 6
Elapsed time: 0 seconds
non_cd4_subset <- RunUMAP(non_cd4_subset, dims = 1:30, reduction = "non_cd4.pca", verbose = FALSE, reduction.name = "non_cd4.umap")
DimPlot(non_cd4_subset, reduction = "non_cd4.umap", group.by = c("condition", "non_cd4_clusters"), combine = FALSE)
[[1]]
[[2]]
# Finding DE genes
Idents(non_cd4_subset) <- "non_cd4_clusters"
non_cd4_subset <- PrepSCTFindMarkers(non_cd4_subset)
Found 2 SCT models. Recorrecting SCT counts using minimum median counts: 5504
| | 0 % ~calculating
|+++++++++++++++++++++++++ | 50% ~01s
|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=01s
tcell.non_cd4.markers <- FindMarkers(non_cd4_subset, ident.1 = 0, ident.2 = 2)
| | 0 % ~calculating
|+ | 1 % ~16s
|+ | 2 % ~16s
|++ | 3 % ~16s
|++ | 4 % ~16s
|+++ | 5 % ~16s
|+++ | 6 % ~16s
|++++ | 7 % ~15s
|++++ | 8 % ~16s
|+++++ | 9 % ~15s
|+++++ | 10% ~15s
|++++++ | 11% ~15s
|++++++ | 12% ~15s
|+++++++ | 13% ~15s
|+++++++ | 14% ~15s
|++++++++ | 15% ~14s
|++++++++ | 16% ~14s
|+++++++++ | 17% ~14s
|+++++++++ | 18% ~14s
|++++++++++ | 19% ~14s
|++++++++++ | 20% ~14s
|+++++++++++ | 21% ~13s
|+++++++++++ | 22% ~13s
|++++++++++++ | 23% ~13s
|++++++++++++ | 24% ~13s
|+++++++++++++ | 25% ~13s
|+++++++++++++ | 26% ~13s
|++++++++++++++ | 27% ~13s
|++++++++++++++ | 28% ~12s
|+++++++++++++++ | 29% ~12s
|+++++++++++++++ | 30% ~12s
|++++++++++++++++ | 31% ~12s
|++++++++++++++++ | 32% ~12s
|+++++++++++++++++ | 33% ~11s
|+++++++++++++++++ | 34% ~11s
|++++++++++++++++++ | 35% ~11s
|++++++++++++++++++ | 36% ~11s
|+++++++++++++++++++ | 37% ~11s
|+++++++++++++++++++ | 38% ~11s
|++++++++++++++++++++ | 39% ~10s
|++++++++++++++++++++ | 40% ~10s
|+++++++++++++++++++++ | 41% ~10s
|+++++++++++++++++++++ | 42% ~10s
|++++++++++++++++++++++ | 43% ~10s
|++++++++++++++++++++++ | 44% ~09s
|+++++++++++++++++++++++ | 45% ~09s
|+++++++++++++++++++++++ | 46% ~09s
|++++++++++++++++++++++++ | 47% ~09s
|++++++++++++++++++++++++ | 48% ~09s
|+++++++++++++++++++++++++ | 49% ~09s
|+++++++++++++++++++++++++ | 50% ~08s
|++++++++++++++++++++++++++ | 51% ~08s
|++++++++++++++++++++++++++ | 52% ~08s
|+++++++++++++++++++++++++++ | 53% ~08s
|+++++++++++++++++++++++++++ | 54% ~08s
|++++++++++++++++++++++++++++ | 55% ~08s
|++++++++++++++++++++++++++++ | 56% ~07s
|+++++++++++++++++++++++++++++ | 57% ~07s
|+++++++++++++++++++++++++++++ | 58% ~07s
|++++++++++++++++++++++++++++++ | 59% ~07s
|++++++++++++++++++++++++++++++ | 60% ~07s
|+++++++++++++++++++++++++++++++ | 61% ~07s
|+++++++++++++++++++++++++++++++ | 62% ~06s
|++++++++++++++++++++++++++++++++ | 63% ~06s
|++++++++++++++++++++++++++++++++ | 64% ~06s
|+++++++++++++++++++++++++++++++++ | 65% ~06s
|+++++++++++++++++++++++++++++++++ | 66% ~06s
|++++++++++++++++++++++++++++++++++ | 67% ~06s
|++++++++++++++++++++++++++++++++++ | 68% ~05s
|+++++++++++++++++++++++++++++++++++ | 69% ~05s
|+++++++++++++++++++++++++++++++++++ | 70% ~05s
|++++++++++++++++++++++++++++++++++++ | 71% ~05s
|++++++++++++++++++++++++++++++++++++ | 72% ~05s
|+++++++++++++++++++++++++++++++++++++ | 73% ~05s
|+++++++++++++++++++++++++++++++++++++ | 74% ~04s
|++++++++++++++++++++++++++++++++++++++ | 75% ~04s
|++++++++++++++++++++++++++++++++++++++ | 76% ~04s
|+++++++++++++++++++++++++++++++++++++++ | 77% ~04s
|+++++++++++++++++++++++++++++++++++++++ | 78% ~04s
|++++++++++++++++++++++++++++++++++++++++ | 79% ~04s
|++++++++++++++++++++++++++++++++++++++++ | 80% ~03s
|+++++++++++++++++++++++++++++++++++++++++ | 81% ~03s
|+++++++++++++++++++++++++++++++++++++++++ | 82% ~03s
|++++++++++++++++++++++++++++++++++++++++++ | 83% ~03s
|++++++++++++++++++++++++++++++++++++++++++ | 84% ~03s
|+++++++++++++++++++++++++++++++++++++++++++ | 85% ~03s
|+++++++++++++++++++++++++++++++++++++++++++ | 86% ~02s
|++++++++++++++++++++++++++++++++++++++++++++ | 87% ~02s
|++++++++++++++++++++++++++++++++++++++++++++ | 88% ~02s
|+++++++++++++++++++++++++++++++++++++++++++++ | 89% ~02s
|+++++++++++++++++++++++++++++++++++++++++++++ | 90% ~02s
|++++++++++++++++++++++++++++++++++++++++++++++ | 91% ~02s
|++++++++++++++++++++++++++++++++++++++++++++++ | 92% ~01s
|+++++++++++++++++++++++++++++++++++++++++++++++ | 93% ~01s
|+++++++++++++++++++++++++++++++++++++++++++++++ | 94% ~01s
|++++++++++++++++++++++++++++++++++++++++++++++++ | 95% ~01s
|++++++++++++++++++++++++++++++++++++++++++++++++ | 96% ~01s
|+++++++++++++++++++++++++++++++++++++++++++++++++ | 97% ~01s
|+++++++++++++++++++++++++++++++++++++++++++++++++ | 98% ~00s
|++++++++++++++++++++++++++++++++++++++++++++++++++| 99% ~00s
|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=17s
tcell.non_cd4.markers
FeaturePlot(non_cd4_subset, features = "Gcnt1", reduction = "non_cd4.umap")
FeaturePlot(non_cd4_subset, features = "Gzmb", reduction = "non_cd4.umap")
FeaturePlot(non_cd4_subset, features = "Spn", reduction = "non_cd4.umap")
FeaturePlot(non_cd4_subset, features = "Sell", reduction = "non_cd4.umap")
FeaturePlot(non_cd4_subset, features = "Prf1", reduction = "non_cd4.umap")
FeaturePlot(non_cd4_subset, features = "Spn", reduction = "non_cd4.umap")
FeaturePlot(non_cd4_subset, features = "Cxcr3", reduction = "non_cd4.umap")
FeaturePlot(non_cd4_subset, features = "Havcr2", reduction = "non_cd4.umap")
FeaturePlot(non_cd4_subset, features = "Ifng", reduction = "non_cd4.umap")
FeaturePlot(non_cd4_subset, features = "Icos", reduction = "non_cd4.umap")
FeaturePlot(non_cd4_subset, features = "Prdm1", reduction = "non_cd4.umap")
FeaturePlot(non_cd4_subset, features = "Tox", reduction = "non_cd4.umap")
FeaturePlot(non_cd4_subset, features = "Gzmk", reduction = "non_cd4.umap")
#FeaturePlot(non_cd4_subset, features = "Gbma", reduction = "non_cd4.umap") NOT FOUND
#FeaturePlot(non_cd4_subset, features = "Gzmh", reduction = "non_cd4.umap") NOT FOUND
FeaturePlot(non_cd4_subset, features = "Gzmb", reduction = "non_cd4.umap")
FeaturePlot(non_cd4_subset, features = "Lag3", reduction = "non_cd4.umap")
FeaturePlot(non_cd4_subset, features = "Tigit", reduction = "non_cd4.umap")
FeaturePlot(non_cd4_subset, features = "Irf4", reduction = "non_cd4.umap")
FeaturePlot(non_cd4_subset, features = "Tcf7", reduction = "non_cd4.umap")
FeaturePlot(non_cd4_subset, features = "Pdcd1", reduction = "non_cd4.umap")
FeaturePlot(non_cd4_subset, features = "Tnf", reduction = "non_cd4.umap")
FeaturePlot(non_cd4_subset, features = "Tbx21", reduction = "non_cd4.umap")
#Save the plots as PNGs
library(ggplot2)
genes <- c(“Gcnt1”, “Gzmb”, “Spn”, “Sell”, “Prf1”, “Cxcr3”, “Havcr2”, “Ifng”, “Icos”, “Prdm1”, “Tox”, “Gzmk”, “Lag3”, “Tigit”, “Irf4”, “Tcf7”, “Pdcd1”, “Tnf”, “Tbx21”)
for (gene in genes) { plot <- FeaturePlot(non_cd4_subset, features = gene, reduction = “non_cd4.umap”) ggsave(filename = paste0(“non_cd4_”, gene, “.png”), plot = plot, width = 6, height = 4, dpi = 300)
# From the list Dr. Krummey sent, find the differential expression of the genes of interest
# Look back to just NON-CD4 T cells to see if there is a difference (complete)